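# Remove English stop words from the extracted word table, save the result, and plot the first 50 remaining words.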

import pandas as pd
import nltk
from nltk.corpus import stopwords
import matplotlib.pyplot as plt
from nltk.tokenize import word_tokenize

# English stop-word set from the NLTK corpus, built once and reused below.
# (The corpus must already be available locally; NLTK raises LookupError
# otherwise.)
stop_words = set(stopwords.words('english'))
print(stop_words)

# Input word table and the destination for the stop-word-free copy.
inputfile = r"..\data\microwave_6extract_word.tsv"
outputfile = r"..\data\microwave_7removed_stopwords.tsv"

# NOTE(review): the input is named .tsv but is parsed as comma-separated,
# while the output below is written tab-separated -- confirm the upstream
# file format.
m_data = pd.read_csv(inputfile, sep=',', encoding='utf-8')
# Rename the four columns read from the file, then drop the last one.
m_data.columns = ['word', 'freq', 'form', '0']
del m_data['0']

# NLTK's English stop words are all lower-case, so compare on a lower-cased
# string view of the column; otherwise capitalised forms such as "The" would
# survive the filter. The stored words themselves are left unchanged.
is_stop_word = m_data['word'].astype(str).str.lower().isin(stop_words)
m_data = m_data[~is_stop_word]

m_data.to_csv(outputfile, sep='\t', index=False, header=True, encoding='utf-8')

# For raw text (instead of a pre-built word table), the equivalent approach is
# to tokenize with nltk.word_tokenize and keep the tokens not in stop_words.

# Only the first 50 rows of the filtered table are plotted.
m_data = m_data.head(50)
m_word, m_freq = (m_data[column].tolist() for column in ('word', 'freq'))

print(type(m_word))

# Bar chart: one bar per word, bar height is the word's frequency.
bar_style = {'label': 'freq', 'color': 'steelblue', 'alpha': 0.8}
plt.bar(m_word, m_freq, **bar_style)
# Rotate the x tick labels by 75 degrees.
plt.xticks(rotation=75)

# Title and axis names.
plt.title("word's frequency")
plt.xlabel("word")
plt.ylabel("frequency")

# Show the legend (the 'freq' label) and display the figure.
plt.legend()
plt.show()
